{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b82b7e3e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: CUDA_VISIBLE_DEVICES=0\n"
     ]
    }
   ],
   "source": [
    "%env CUDA_VISIBLE_DEVICES 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b1d746ab",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/monty/projects/pii-ner/utils/misc.py:38: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n",
      "  _seqeval_metric = load_metric(\"seqeval\")\n",
      "Using the latest cached version of the module from /home/monty/.cache/huggingface/modules/datasets_modules/metrics/seqeval/c8563af43bdce095d0f9e8b8b79c9c96d5ea5499b3bf66f90301c9cb82910f11 (last modified on Thu Feb 16 17:58:29 2023) since it couldn't be found locally at seqeval, or remotely on the Hugging Face Hub.\n",
      "Found cached dataset parquet (/home/monty/.cache/huggingface/datasets/bigcode___parquet/bigcode--pii-for-code-v2-e4dbf8dee82c409a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "25044ad684f94e9a9ecf63ff71785fd1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from tqdm.notebook import tqdm\n",
    "from utils import PiiNERPipeline\n",
    "from datasets import load_dataset\n",
    "\n",
    "pipeline = PiiNERPipeline('bigcode/bigcode-encoder-pii-ner',\n",
    "                          batch_size=32,\n",
    "                          window_size=512,\n",
    "                          device=0,\n",
    "                          num_workers=2,\n",
    "                          use_auth_token=True)\n",
    "dataset = load_dataset('bigcode/pii-for-code-v2', use_auth_token=True)['train']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "f9edcd91",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e09ce82c178a451bb430605ebc52efb5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Detect PII entities:   0%|          | 0/400 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Too many dataloader workers: 2 (max is dataset.n_shards=1). Stopping 1 dataloader workers.\n",
      "To parallelize data loading, we give each process some shards (or data sources) to process. Therefore it's unnecessary to have a number of workers greater than dataset.n_shards=1. To enable more parallelism, please split the dataset in more files than 1.\n"
     ]
    }
   ],
   "source": [
    "iterator = pipeline(dataset)\n",
    "processed_dataset = list(tqdm(iterator, total=len(dataset), desc='Detect PII entities'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f7d13677",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'tag': 'NAME',\n",
       "  'start': 23,\n",
       "  'end': 36,\n",
       "  'value': 'Davis E. King',\n",
       "  'context': '// Copyright (C) 2003  Davis E. King (davis@dlib.net), Miguel Grinberg\\n// License: Boo',\n",
       "  'score': 0.9986383},\n",
       " {'tag': 'EMAIL',\n",
       "  'start': 38,\n",
       "  'end': 52,\n",
       "  'value': 'davis@dlib.net',\n",
       "  'context': '// Copyright (C) 2003  Davis E. King (davis@dlib.net), Miguel Grinberg\\n// License: Boost Software Lice',\n",
       "  'score': 0.9993752},\n",
       " {'tag': 'NAME',\n",
       "  'start': 55,\n",
       "  'end': 70,\n",
       "  'value': 'Miguel Grinberg',\n",
       "  'context': 'opyright (C) 2003  Davis E. King (davis@dlib.net), Miguel Grinberg\\n// License: Boost Software License   See LICENSE.',\n",
       "  'score': 0.99788713}]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "processed_dataset[0]['entities']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c57e563b",
   "metadata": {},
   "source": [
    "## Do some profiling here"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "ec990fea",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting line_profiler\n",
      "  Downloading line_profiler-4.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (661 kB)\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m661.9/661.9 kB\u001B[0m \u001B[31m1.3 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0ma \u001B[36m0:00:01\u001B[0m\n",
      "\u001B[?25hInstalling collected packages: line_profiler\n",
      "Successfully installed line_profiler-4.0.3\n"
     ]
    }
   ],
   "source": [
    "!pip install line_profiler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "55dc1eee",
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext line_profiler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "5907b6a8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "284c2b46c4904bfe92c00320466e0731",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Detect PII entities:   0%|          | 0/400 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Too many dataloader workers: 2 (max is dataset.n_shards=1). Stopping 1 dataloader workers.\n",
      "To parallelize data loading, we give each process some shards (or data sources) to process. Therefore it's unnecessary to have a number of workers greater than dataset.n_shards=1. To enable more parallelism, please split the dataset in more files than 1.\n"
     ]
    }
   ],
   "source": [
    "%lprun -f pipeline._predict_iterator list(tqdm( pipeline(dataset), total=len(dataset), desc='Detect PII entities'))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b1223f5e",
   "metadata": {},
   "source": [
    "    Timer unit: 1e-09 s\n",
    "\n",
    "    Total time: 289.552 s\n",
    "    File: pii-ner/utils/pipeline.py\n",
    "    Function: _predict_iterator at line 159\n",
    "\n",
    "    Line #      Hits         Time  Per Hit   % Time  Line Contents\n",
    "    ==============================================================\n",
    "       159                                               def _predict_iterator(self, inputs: Dataset, batch_size: int):\n",
    "       160         1     382949.0 382949.0      0.0          loader = self._get_pipeline_dataloader(inputs, self.tokenizer,\n",
    "       161         1        214.0    214.0      0.0                                                 batch_size=batch_size,\n",
    "       162         1        427.0    427.0      0.0                                                 window_size=self.window_size,\n",
    "       163         1        242.0    242.0      0.0                                                 window_overlap=self.window_overlap,\n",
    "       164         1        247.0    247.0      0.0                                                 num_workers=self.num_workers)\n",
    "       165                                           \n",
    "       166         1 1902225579.0 1902225579.0      0.7          self.model.to(self.device)\n",
    "       167                                           \n",
    "       168         1       2833.0   2833.0      0.0          processing_iterator = self.process_inputs(loader)\n",
    "       169       400 283532077631.0 708830194.1     97.9          for processed in self.combine_chunked_inputs(processing_iterator):\n",
    "       170       400 4116858091.0 10292145.2      1.4              yield self.extract_entities(text=processed['text'],\n",
    "       171       400     157521.0    393.8      0.0                                          logits=processed['logits'],\n",
    "       172       400     162483.0    406.2      0.0                                          offset_mapping=processed['offset_mapping'])\n",
    "       173                                                   self.model.to('cpu')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "c4212a7c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "547930f1d29c4376bf0ab781edb8fa4f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Detect PII entities:   0%|          | 0/20 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from utils.pipeline import PipelineIterator\n",
    "pipeline.num_workers = 0\n",
    "%lprun -f pipeline.forward list(tqdm( pipeline(dataset.from_list(list(dataset)[:20])), total=20, desc='Detect PII entities'))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "87089494",
   "metadata": {},
   "source": [
    "    Timer unit: 1e-09 s\n",
    "\n",
    "    Total time: 12.1679 s\n",
    "    File: pii-ner/utils/pipeline.py\n",
    "    Function: forward at line 100\n",
    "\n",
    "    Line #      Hits         Time  Per Hit   % Time  Line Contents\n",
    "    ==============================================================\n",
    "       100                                               def forward(self, model_inputs, **forward_params):\n",
    "       101         7     376197.0  53742.4      0.0          with self.device_placement():\n",
    "       102         7     585588.0  83655.4      0.0              inference_context = self.get_inference_context()\n",
    "       103         7     142410.0  20344.3      0.0              with inference_context():\n",
    "       104         7 5039463571.0 719923367.3     41.4                  model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)\n",
    "       105         7   86368434.0 12338347.7      0.7                  model_outputs = self._forward(model_inputs, **forward_params)\n",
    "       106         7 7040784741.0 1005826391.6     57.9                  model_outputs = self._ensure_tensor_on_device(model_outputs, device=torch.device(\"cpu\"))\n",
    "       107         7     211824.0  30260.6      0.0                  model_outputs = {name: tensor.numpy() if isinstance(tensor, torch.Tensor) else tensor\n",
    "       108         7       6737.0    962.4      0.0                                   for name, tensor in model_outputs.items()}\n",
    "       109                                           \n",
    "       110         7       1324.0    189.1      0.0          return model_outputs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "33a72a0a",
   "metadata": {},
   "source": [
    "### 5 seconds out 12 (41% of time) consumed on transfering from cpu to cuda \n",
    "### 7 seconds out 12 (58% of time) consumed on transfering from cuda to cpu \n",
    "### so, most of the time consumption is just a bus bottleneck =/"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
